### installing the tidyverse
#install.packages("tidyverse")
#use above code or use "Packages" in bottom right window.
Module 1: Tidyverse Package and Leaflet
Tidyverse
This module will discuss some important functions in the Tidyverse package to manipulate data. The tidyverse is a set of R packages that include many different commands for managing data.
Installing Tidyverse
###loading the tidyverse package
library(tidyverse)
MathSS <- read_csv("MATStudent_Survey.csv") # this is not read.csv
Tibble and Data frame
Difference between Tibble and Data frame
“read_csv” produces a tibble rather than a data frame.
class(MathSS)
[1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
dim(MathSS)
[1] 254 92
#vignette("tibble")
## Info about Tibble
##### tibble and dataframe
# Turn the object returned by read_csv into a plain data frame ...
MathSS_df <- MathSS %>% as.data.frame()
#head(MathSS_df) #use head function to check the first few rows of the dataset
# ... and convert that data frame back into a tibble.
MathSS_tibble <- MathSS_df %>% as_tibble()
#head(MathSS_tibble)
#View(MathSS)
####drop rows with missing data
#MathSS <- drop_na(MathSS)
Filtering data (filter function)
Example:
Selecting only female students’ details from dataset
The Gender variable in the dataset has following gender categories.
table(MathSS$GENDER)# full dataset with all the types of gender
Female Male Other
153 97 4
But using filter function we can choose only Female students.
#1
women <- filter(MathSS,GENDER=="Female")
Use the table function to verify that the “women” dataset has only women.
table(women$GENDER) # only females
Female
153
- Selecting only age>16 female students details from dataset
We also can use filter function to create subsets with different categories.
#2
Age_women <- MathSS %>% filter(GENDER == "Female" & AGE > 16) # keep rows that are female AND older than 16
print(Age_women[,5:10], n=10)
# A tibble: 142 × 6
LOCATION GENDER GENDER_2 AGE SMOKE HEIGHT_2
<chr> <chr> <dbl> <dbl> <chr> <dbl>
1 Clifton Campus Female 1 17 No 66
2 Clifton Campus Female 1 35 No 62
3 Clifton Campus Female 1 18 No 66
4 Clifton Campus Female 1 24 No 69
5 Clifton Campus Female 1 22 No 67.5
6 Clifton Campus Female 1 19 No 65
7 Clifton Campus Female 1 44 Yes 66
8 Clifton Campus Female 1 30 No 66
9 Clifton Campus Female 1 40 No 66
10 Clifton Campus Female 1 22 No 66
# ℹ 132 more rows
The Age_women dataset has female students who are older than 16 (AGE > 16).
Select certain columns from the dataset (select function)
Example:
Selecting only the LOCATION, SEASON, and STATE variables from the Age_women dataset
####select certain columns from the data
Age_women_select <- Age_women %>% select("LOCATION", "SEASON", "STATE") # keep only these three columns of Age_women
print(Age_women_select, n=10)
# A tibble: 142 × 3
LOCATION SEASON STATE
<chr> <chr> <chr>
1 Clifton Campus Summer Ohio
2 Clifton Campus Summer Ohio
3 Clifton Campus Summer Ohio
4 Clifton Campus Summer Ohio
5 Clifton Campus Winter Ohio
6 Clifton Campus Winter Ohio
7 Clifton Campus Winter Ohio
8 Clifton Campus Fall Ohio
9 Clifton Campus Fall Ohio
10 Clifton Campus Fall Ohio
# ℹ 132 more rows
Combine multiple commands using piping (%>%)
####combine multiple commands using piping
#x %>% f(y) is the same as f(x, y)
# Filter the rows, then pick the columns, in a single pipeline.
Age_women_location <- MathSS %>%
  filter(GENDER == "Female" & AGE > 16) %>%
  select("LOCATION", "SEASON", "STATE")
print(Age_women_location, n=10)
# A tibble: 142 × 3
LOCATION SEASON STATE
<chr> <chr> <chr>
1 Clifton Campus Summer Ohio
2 Clifton Campus Summer Ohio
3 Clifton Campus Summer Ohio
4 Clifton Campus Summer Ohio
5 Clifton Campus Winter Ohio
6 Clifton Campus Winter Ohio
7 Clifton Campus Winter Ohio
8 Clifton Campus Fall Ohio
9 Clifton Campus Fall Ohio
10 Clifton Campus Fall Ohio
# ℹ 132 more rows
Add grouping to data (group_by function)
Example:
Suppose we want to group the Age_women_location dataset according to seasons. The Age_women_location dataset is shown below.
print(Age_women_location, n=10)
# A tibble: 142 × 3
LOCATION SEASON STATE
<chr> <chr> <chr>
1 Clifton Campus Summer Ohio
2 Clifton Campus Summer Ohio
3 Clifton Campus Summer Ohio
4 Clifton Campus Summer Ohio
5 Clifton Campus Winter Ohio
6 Clifton Campus Winter Ohio
7 Clifton Campus Winter Ohio
8 Clifton Campus Fall Ohio
9 Clifton Campus Fall Ohio
10 Clifton Campus Fall Ohio
# ℹ 132 more rows
##### add grouping to data
# Attach SEASON as the grouping variable; the rows themselves are unchanged.
grouped_season <- Age_women_location %>% group_by(SEASON)
print(grouped_season, n=10)
# A tibble: 142 × 3
# Groups: SEASON [4]
LOCATION SEASON STATE
<chr> <chr> <chr>
1 Clifton Campus Summer Ohio
2 Clifton Campus Summer Ohio
3 Clifton Campus Summer Ohio
4 Clifton Campus Summer Ohio
5 Clifton Campus Winter Ohio
6 Clifton Campus Winter Ohio
7 Clifton Campus Winter Ohio
8 Clifton Campus Fall Ohio
9 Clifton Campus Fall Ohio
10 Clifton Campus Fall Ohio
# ℹ 132 more rows
#with piping
#grouped_SEASON <- Age_women_location %>% group_by(SEASON)
#head(grouped_SEASON)
###remove grouping with ungroup
#ungroup(grouped_SEASON)
Summarize the data (summarise function)
Example:
- Calculating mean age and height for the MathSS dataset.
##### summarize the data
summarise(MathSS,mean_age=mean(AGE),mean_height=mean(HEIGHT_2)) # mean_age is NA because AGE contains NAs; mean() cannot compute it unless na.rm=TRUE is set.
# A tibble: 1 × 2
mean_age mean_height
<dbl> <dbl>
1 NA 67.1
summarise(MathSS,mean_age=mean(AGE, na.rm=TRUE),mean_height=mean(HEIGHT_2))
# A tibble: 1 × 2
mean_age mean_height
<dbl> <dbl>
1 23.9 67.1
## with piping
#MathSS %>% summarise(mean_age=mean(AGE, na.rm=TRUE),mean_HEIGHT_2=mean(HEIGHT_2))
- Calculating mean age and height for the MathSS dataset by gender
##summarising grouped data (summaries by group)
# Group the full survey by GENDER so the summary below is computed per gender.
grouped_GENDER <- group_by(MathSS, GENDER)
summarise(grouped_GENDER,mean_age=mean(AGE, na.rm=TRUE),mean_height=mean(HEIGHT_2))
# A tibble: 3 × 3
GENDER mean_age mean_height
<chr> <dbl> <dbl>
1 Female 24.5 64.8
2 Male 23.2 70.6
3 Other 17.8 68
## with piping
#MathSS %>% group_by(GENDER) %>% summarise(mean_age=mean(AGE, na.rm=TRUE),mean_height=mean(HEIGHT_2))
Leaflet
We use the function leaflet() to initialize a map widget. This will simply load a gray panel void of any geographic features.
library(leaflet)
leaflet()
Basemaps
Default (OpenStreetMap) Tiles
Before we define our areas of interest and layer on data, the map widget needs a “basemap” to serve as its foundation. Basemaps consist of map tiles.
Map tiles: individual map sections that join together to form a composite picture. We add a basemap to the map widget with the addTiles() function. When it is called with no arguments, OpenStreetMap (https://www.openstreetmap.org/#map=4/38.01/-95.84) tiles are used by default.
leaflet() %>% addTiles()
The setView() function allows a user to specify a longitude (east-west) and latitude (north-south) to serve as the initial center of the map.
leaflet() %>% addTiles() %>% setView(lng = -71.0589, lat = 42.3601, zoom = 12)
Third-Party Tiles
Many popular free third-party basemaps can be added using the addProviderTiles() function.
names(providers)[1:5] #first 5 providers in the list
[1] "OpenStreetMap" "OpenStreetMap.Mapnik" "OpenStreetMap.DE"
[4] "OpenStreetMap.CH" "OpenStreetMap.France"
leaflet() %>% setView(lng = -71.0589, lat = 42.3601, zoom = 12) %>% addProviderTiles(providers$CartoDB.Positron)
Markers
Use markers to call out points on the map. Marker locations are expressed in latitude/longitude coordinates, and can either appear as icons or as circles.
Icon Markers
Icon markers are added using the addMarkers or the addAwesomeMarkers functions. Their default appearance is a dropped pin. As with most layer functions, the popup argument can be used to add a message to be displayed on click, and the label option can be used to display a text label either on hover or statically.
The quakes data set gives the locations of 1000 seismic events of MB > 4.0. The events occurred in a cube near Fiji since 1964.
A data frame with 1000 observations on 5 variables.
lat: numeric Latitude of event
long: numeric Longitude
depth: numeric Depth (km)
mag: numeric Richter Magnitude
stations: numeric Number of stations reporting
data(quakes)
head(quakes)
  lat long depth mag stations
1 -20.42 181.62 562 4.8 41
2 -20.62 181.03 650 4.2 15
3 -26.00 184.10 42 5.4 43
4 -17.97 181.66 626 4.1 19
5 -20.42 181.96 649 4.0 11
6 -19.68 184.31 195 4.0 12
# Show first 10 rows from the `quakes` dataset
leaflet(quakes[1:10,]) %>% addTiles() %>% addMarkers(~long, ~lat, label = ~as.character(mag))
You can provide custom markers in one of several ways, depending on the scenario. For each of these ways, the icon can be provided as either a URL or as a file path. Check the readings.
Marker Clusters
When there are a large number of markers on a map, you can cluster them.
leaflet(quakes) %>% addTiles() %>% addMarkers(~long, ~lat, clusterOptions = markerClusterOptions())
Circle Markers
Circle markers are much like regular circles (see Lines and Shapes), except that their radius in onscreen pixels stays constant regardless of zoom level.
leaflet(quakes[1:10,]) %>% addTiles() %>% addCircleMarkers()
Assuming "long" and "lat" are longitude and latitude, respectively
Popups
Popups are small boxes containing arbitrary HTML, that point to a specific point on the map. Use the addPopups() function to add standalone popup to the map.
# Popup text: three pieces joined with HTML line breaks.
content <- paste(
  "<b>Samurai Noodle</b>",
  "606 5th Ave. S",
  "Seattle, WA 98138",
  sep = "<br/>"
)
# Base map with a standalone popup showing `content` at the given coordinates.
leaflet() %>%
  addTiles() %>%
  addPopups(-122.327298, 47.597131, content)
#adding a link
content <- paste(sep = "<br/>",
"<b><a href='http://www.samurainoodle.com'>Samurai Noodle</a></b>",
"606 5th Ave. S",
"Seattle, WA 98138"
)
leaflet() %>% addTiles() %>%
addPopups(-122.327298, 47.597131, content,
options = popupOptions(closeButton = FALSE)
)
Circles
Circles are added using addCircles(). Circles are similar to circle markers; the only difference is that circles have their radii specified in meters, while circle markers are specified in pixels. As a result, circles are scaled with the map as the user zooms in and out, while circle markers remain a constant size on the screen regardless of zoom level.
# Four cities, one row each, with latitude, longitude and population columns.
df <- data.frame(
  City = c("Boston", "Hartford", "New York City", "Philadelphia"),
  Lat = c(42.3601, 41.7627, 40.7127, 39.9500),
  Long = c(-71.0589, -72.6743, -74.0059, -75.1667),
  Pop = c(645966, 125017, 8406000, 1553000)
)
head(df)
  City Lat Long Pop
1 Boston 42.3601 -71.0589 645966
2 Hartford 41.7627 -72.6743 125017
3 New York City 40.7127 -74.0059 8406000
4 Philadelphia 39.9500 -75.1667 1553000
# n1: circles at each city's coordinates, with no radius specified.
n1 <- leaflet(df) %>%
  addTiles() %>%
  addCircles(lng = ~Long, lat = ~Lat)
# n2: radius computed from sqrt(Pop) * 20; the popup shows the city name.
n2 <- leaflet(df) %>%
  addTiles() %>%
  addCircles(
    lng = ~Long, lat = ~Lat, weight = 1,
    radius = ~sqrt(Pop)*20, popup = ~City
  )
n1;n2
Choropleth map
NAME centskWh
1 Alabama 9.63
2 Alaska 19.36
3 Arizona 10.85
4 Arkansas 7.78
5 California 16.58
6 Colorado 10.02
Linking to GEOS 3.11.2, GDAL 3.7.2, PROJ 9.3.0; sf_use_s2() is TRUE
[1] TRUE
Line plots
Readings
Tidyverse website: https://www.tidyverse.org/ (Links to an external site.)
R for Data Science (The introduction to R for Data Science by Garrett Grolemund and Hadley Wickham.)
Introduction: https://r4ds.had.co.nz/introduction.html (Links to an external site.)
Data Import Cheat Sheet (This Cheat Sheet was created by the RStudio team, and is hosted on Garrett Grolemund’s Github page.)
https://github.com/rstudio/cheatsheets/blob/master/data-import.pdf (Links to an external site.)